In [9]:
import pandas as pd
import numpy as np 
import glob
import datetime 
import time

Read all of the scraped CSV files from `./csv/` and merge them into a single DataFrame keyed by `jobid`.


In [10]:
# Collect every scraped CSV in ./csv/ and stack them into one DataFrame,
# indexed by 'jobid' (assumes all files share the same column schema --
# TODO confirm against the scraper output).
allFiles = glob.glob("./csv/*.csv")

# NOTE(review): the previous explicit single-file read into `jobs` was dead
# code -- that file is matched by the glob above and re-read in the loop,
# and the `jobs` variable was never used. Removed.
dataframes = [pd.read_csv(file, index_col='jobid', header=0)
              for file in allFiles]

# Raises ValueError if ./csv/ is empty, which is the desired loud failure.
merged_jobs = pd.concat(dataframes)

Drop duplicate rows introduced by overlapping scrapes, keeping the most recent occurrence.


In [11]:
# Drop rows that are exact duplicates across the overlapping scrape files,
# keeping the last occurrence. Reassignment instead of inplace=True keeps
# the cell idempotent on re-run; print(...) works in both Python 2 and 3.
# Fixed typo in both messages: "de-deuplication" -> "de-duplication".
print("Before de-duplication count is " + str(len(merged_jobs.index)))
merged_jobs = merged_jobs.drop_duplicates(keep='last')
print("After de-duplication count is " + str(len(merged_jobs.index)))


Before de-duplication count is 44860
After de-duplication count is 14188

In [12]:
# Write the de-duplicated jobs to ./csv_out/, embedding today's date and
# the final row count in the file name (e.g. jobs_merged_20160912_14188.csv).
date_stamp = time.strftime("%Y%m%d")
n_rows = len(merged_jobs.index)

out_path = "./csv_out/jobs_merged_" + date_stamp + "_" + str(n_rows) + ".csv"
merged_jobs.to_csv(out_path, mode='w')

In [ ]: